####################################################################
#***2019 descriptives
## Packages
# To install and open the R packages that you need for this code. 
need <- c('tidyverse','readstata13','lfe','glue','rdrobust', 'car', 'stargazer','arm', 'broom', 'ggplot2', 'dotwhisker', 'gridExtra', 'corrplot')
have <- need %in% rownames(installed.packages()) 
if(any(!have)) install.packages(need[!have]) 
invisible(lapply(need, library, character.only=T)) 

# Change the path to wherever you place the models.
# Set up the working directory.
# Resolve the folder that holds this script from the active RStudio source
# editor (this call needs a running RStudio session) and move into it.
script_folder <- dirname(rstudioapi::getSourceEditorContext()$path)
setwd(glue("{script_folder}"))

# Start from an empty workspace; this also removes `script_folder`, which is
# no longer needed once the working directory has been set.
rm(list = ls())

# All relative paths below are resolved from the parent of the script folder.
setwd("../")

## Load the RD data (provides `rd.data`, used throughout this script).
load("5prepdata/final_rd_data.RData")


#get descriptives for: 
# rd.data$ever_hle
# rd.data$ever_ranprim_h
# rd.data$ever_ranprim_s
# rd.data$ever_runhouse
# rd.data$ever_gen

# Per-state row count and total of `ever_hle`, opened in the data viewer
# (interactive inspection only; nothing is stored).
rd.data %>%
  group_by(state) %>%
  summarise(
    total = n(),
    total_hle = sum(ever_hle)
  ) %>%
  View()


## get a bandwidth-limited dataset for description purposes:
# Remove any earlier versions of the derived columns so they are rebuilt
# from scratch below.
rd.data$cutoff <- NULL
rd.data$bandwidth <- NULL
rd.data$kw <- NULL

# Data-driven bandwidth (local linear fit, triangular kernel, "mserd"
# selector) for the `ever_runhouse` outcome with `victory_marg` as the running
# variable and the cutoff at 0.
# NOTE(review): `+` below is arithmetic addition, so rdbwselect() receives ONE
# vector holding the row-wise sum of these seven variables, not seven separate
# covariates. If separate covariates were intended this should be cbind(...).
# Left unchanged because it alters the selected bandwidth -- confirm against
# the estimation scripts before touching it.
bw_fit <- rdbwselect(
  y = rd.data$ever_runhouse,
  x = rd.data$victory_marg,
  p = 1,
  c = 0,
  kernel = "tri",
  bwselect = "mserd",
  covs = rd.data$running_terms_served +
    rd.data$dem +
    rd.data$rep +
    rd.data$spc_elec +
    rd.data$term_length +
    rd.data$number_candidates +
    rd.data$year
)
# Keep only the first reported bandwidth, recycled to every row.
rd.data$bandwidth <- bw_fit$bws[1]
rm(bw_fit)

rd.data$cutoff <- 0

# Weight = 1 - |cutoff - margin| / bandwidth, then forced to 0 for every
# observation that lies outside [cutoff - bandwidth, cutoff + bandwidth].
rd.data$kw <- with(rd.data, 1 - abs(cutoff - victory_marg) / bandwidth)
rd.data$kw[with(
  rd.data,
  victory_marg > (cutoff + bandwidth) | victory_marg < (cutoff - bandwidth)
)] <- 0


# State-level summary of the observations that carry a positive kernel weight:
# number of rows, mean of `cong_leg_ratio`, and mean of `mds1` (NA ignored).
# The state mean of `mds1` is then z-scored across states (mean 0, sd 1),
# again ignoring missing values.
rd.bw <- rd.data %>%
  filter(kw > 0) %>%
  group_by(state) %>%
  summarise(
    total_inrd = n(),
    cong_leg_ratio = mean(cong_leg_ratio),
    mean_mds1 = mean(mds1, na.rm = TRUE)
  ) %>%
  mutate(
    mean_mds1 = (mean_mds1 - mean(mean_mds1, na.rm = TRUE)) /
      sd(mean_mds1, na.rm = TRUE)
  )


# One row per state:
#   total      - number of rows in rd.data
#   total_hle  - sum of `ever_hle`
#   prop       - state's share of all rows
#   prop_inrd  - state's share of the rows summarised in rd.bw
# The join key is spelled out so the merge cannot silently pick up additional
# shared columns if either table gains one.
# NOTE(review): Louisiana is removed only AFTER the shares are computed, so it
# still contributes to both denominators -- confirm this is intended.
propdf <- rd.data %>%
  group_by(state) %>%
  summarize(total = n(), total_hle = sum(ever_hle)) %>%
  left_join(rd.bw, by = "state") %>%
  ungroup() %>%
  mutate(
    prop = total / sum(total),
    prop_inrd = total_inrd / sum(total_inrd)
  ) %>%
  filter(state != "Louisiana")

# Regress each state's share of the RD sample on its share of all elections
# and test whether the slope equals 1.
# The model is specified with `data =` rather than `propdf$...` inside the
# formula: the estimates are identical, but the coefficient is now named
# `prop`, and the fitted object works with predict()/update() on new data.
model <- lm(prop_inrd ~ prop, data = propdf)
summary(model)
linearHypothesis(model, "prop = 1")

# Appendix Figure 1: state share of the RD sample against state share of all
# elections, labelled with postal abbreviations, with an OLS fit and a red
# dashed x = y reference line.
#
# Changes relative to the earlier version of this block:
#  * The plot is stored and passed to ggsave() explicitly. Previously ggsave()
#    was a separate statement that saved last_plot(), which is only correct
#    when the plot has just been auto-printed (i.e. not under source() or
#    Rscript).
#  * The reference line now uses the same values for x and y, so it really is
#    x = y. The old line ran from (min prop, min prop_inrd) to
#    (max prop, max prop_inrd), which is the identity line only if both
#    variables happen to share the same range.
#  * theme_minimal() was dropped: theme_bw() is a complete theme and replaced
#    it entirely.
#  * The join key for the state abbreviations is explicit.
identity_range <- range(propdf$prop)

appendix_fig1 <- propdf %>%
  left_join(data.frame(state = state.name, state.abb), by = "state") %>%
  ggplot(aes(x = prop, y = prop_inrd, label = state.abb)) +
  geom_point() +
  geom_smooth(method = "lm") +
  geom_line(
    data = data.frame(x1 = identity_range, y1 = identity_range),
    aes(x = x1, y = y1),
    inherit.aes = FALSE,
    color = "red",
    linetype = "dashed"
  ) +
  geom_text(hjust = -.3) +
  xlab("Share of All Elections") +
  ylab("Share in RD Sample") +
  theme_bw()
rm(identity_range)

# Show the figure (as the auto-print did in interactive use) and write it out.
print(appendix_fig1)
ggsave(
  filename = "7tex/manuscript/tables/sourcefiles/Appendix Figure 1.pdf",
  plot = appendix_fig1,
  units = "in",
  width = 6,
  height = 4
)


# 
# #drop conf interval and add red dashed line for x=y
# install.packages()
# 
# model <- lm(y ~ x)
# 
# linearHypothesis(model, "x = 1")


#### what happens to losing candidates? ####
# Reload the RD data and additionally load the version with opponent
# information (presumably the source of `data.orig` -- verify).
load("5prepdata/final_rd_data.RData")
load("5prepdata/final_rd_data_with_opp.RData")

# Build flag_k_runhouse for k = 1..15:
#   flag_1 is a copy of opp_1_runhouse;
#   flag_k (k >= 2) is opp_k_runhouse minus opp_(k-1)_runhouse.
# Wrapped in local() so the loop variables do not leak into the workspace.
data.orig <- local({
  flagged <- data.orig
  flagged[["flag_1_runhouse"]] <- flagged[["opp_1_runhouse"]]
  for (k in 2:15) {
    current <- paste0("opp_", k, "_runhouse")
    previous <- paste0("opp_", k - 1, "_runhouse")
    flagged[[paste0("flag_", k, "_runhouse")]] <-
      flagged[[current]] - flagged[[previous]]
  }
  flagged
})


#### state by professionalism category ####
#prof.rank,
# expend.rank,
# salary.rank,
# slength.rank

# probably turn this into a table for the appendix
# Most recent professionalism categorisation per state (printed only).
# probably turn this into a table for the appendix
#
# Fix: the earlier version called distinct(state, .keep_all = TRUE) first,
# which keeps just the first row per state in data order; the subsequent
# "last row per state" filter therefore had nothing left to choose from and
# the result was not the most recent observation. Rows are now sorted by year
# within state and the last one is kept.
# NOTE(review): if a state has several rows in its latest year, one of them is
# picked arbitrarily -- assumes the category columns agree within state-year.
rd.data %>%
  ungroup() %>%
  arrange(state, year) %>%
  group_by(state) %>%
  filter(row_number() == n()) %>% # last (= most recent) row for each state
  ungroup() %>%
  dplyr::select(state, prof.rank.cat:slength.rank.cat) %>%
  arrange(state)

# Appendix Table 19: for every state, the set of categories it has held on
# each professionalism measure, based on one row per state-decade (the first
# such row in data order) and excluding the 1960 decade. Distinct values are
# sorted and joined with " / ", so a state that changed category shows e.g.
# two values separated by a slash.
#
# Changes relative to the earlier version of this block:
#  * funs() is deprecated in dplyr; an ordinary function is passed to
#    summarize_at() instead (same result, same column names).
#  * The result is converted with as.data.frame() before stargazer(), as is
#    done for Table 1, because stargazer is written for plain data frames.
#  * T / F replaced by TRUE / FALSE.
rd.data %>%
  ungroup() %>%
  group_by(state) %>%
  distinct(decade, .keep_all = TRUE) %>%
  dplyr::select(state, decade, prof.rank.cat:slength.rank.cat) %>%
  filter(decade != 1960) %>%
  group_by(state) %>%
  arrange(decade) %>%
  # prof.dif marks decades where the category differs from the previous row
  # for the state. It does not reach the table (summarize_at drops it); it is
  # kept for ad-hoc inspection of which states changed category.
  mutate(prof.dif = ifelse(lag(prof.rank.cat) != prof.rank.cat, 1, 0)) %>%
  ungroup() %>%
  group_by(state) %>%
  dplyr::select(-decade) %>%
  summarize_at(
    vars(prof.rank.cat:slength.rank.cat),
    .funs = function(x) paste(unique(sort(x)), collapse = " / ")
  ) %>%
  rename(
    Professionalism = prof.rank.cat,
    Expenditures = expend.rank.cat,
    Salary = salary.rank.cat,
    'Session Length' = slength.rank.cat
  ) %>%
  as.data.frame() %>%
  stargazer(
    summary = FALSE,
    rownames = FALSE,
    out = "7tex/manuscript/tables/sourcefiles/Appendix Table 19.tex"
  )


#%>% filter(prof.dif==1) %>% View
#28 out of 196 observations change categories of professionalism at least once
# some change more than once

# put med / high if state changed



#### summary stats table ####

# Rebuild the bandwidth and kernel-weight columns on rd.data for the summary
# table. Any existing copies are cleared first.
rd.data$cutoff <- NULL
rd.data$bandwidth <- NULL
rd.data$kw <- NULL

# Data-driven bandwidth (local linear fit, triangular kernel, "mserd"
# selector) for the `ever_runhouse` outcome with `victory_marg` as the running
# variable and the cutoff at 0.
# NOTE(review): `+` below is arithmetic addition, so rdbwselect() receives ONE
# vector holding the row-wise sum of these seven variables, not seven separate
# covariates. If separate covariates were intended this should be cbind(...).
# Left unchanged because it alters the selected bandwidth -- confirm against
# the estimation scripts before touching it.
bw_fit <- rdbwselect(
  y = rd.data$ever_runhouse,
  x = rd.data$victory_marg,
  p = 1,
  c = 0,
  kernel = "tri",
  bwselect = "mserd",
  covs = rd.data$running_terms_served +
    rd.data$dem +
    rd.data$rep +
    rd.data$spc_elec +
    rd.data$term_length +
    rd.data$number_candidates +
    rd.data$year
)
# Keep only the first reported bandwidth, recycled to every row.
rd.data$bandwidth <- bw_fit$bws[1]
rm(bw_fit)

rd.data$cutoff <- 0

# Weight = 1 - |cutoff - margin| / bandwidth, then forced to 0 for every
# observation that lies outside [cutoff - bandwidth, cutoff + bandwidth].
rd.data$kw <- with(rd.data, 1 - abs(cutoff - victory_marg) / bandwidth)
rd.data$kw[with(
  rd.data,
  victory_marg > (cutoff + bandwidth) | victory_marg < (cutoff - bandwidth)
)] <- 0

## THIS PRODUCES TABLE 1
## THIS PRODUCES TABLE 1
# Summary statistics for the observations with positive kernel weight.
# Columns are renamed to their display labels while being selected, and the
# result is converted to a plain data frame before being handed to stargazer,
# which writes the LaTeX table (25th/75th percentiles omitted).
rd.data %>%
  ungroup() %>%
  filter(kw > 0) %>%
  dplyr::select(
    `Won Election` = won_election,
    `Victory Margin` = victory_marg,
    `Democrat` = dem,
    `Terms Served` = running_terms_served,
    `Term Length` = term_length,
    `Number of candidates in election` = number_candidates,
    `Ever run for Cong. prim or gen.` = ever_hle,
    `Run in House Primary` = ever_ranprim_h,
    `Run in Senate Primary` = ever_ranprim_s,
    `Run for House` = ever_runhouse,
    `Run in General Election` = ever_gen,
    Professionalism = mds1,
    `Session Length (days)` = slength,
    `Salary (000s)` = salary_real,
    Expenditures = expend,
    `State Congressional delegation size` = del.size
  ) %>%
  as.data.frame() %>%
  stargazer(
    omit.summary.stat = c("p25", "p75"),
    digits = 1,
    digits.extra = 1,
    label = "tab:sum_stats",
    out = "7tex/manuscript/tables/sourcefiles/Table 1.tex",
    notes = "\\parbox[t]{\\linewidth}{Note: Table presents summary statistics of politicians' state legislature and later Congressional election outcomes. The sample includes all contested elections within the optimal bandwidth for the ``ever ran in House general election'' outcome.}"
  )
